# GitHub version: install via the dmlc drat repository
install.packages("drat", repos="https://cran.rstudio.com")
# addRepo is exported from drat, so ::: (internal access) is unnecessary
drat::addRepo("dmlc")
install.packages("xgboost", repos="http://dmlc.ml/drat/", type = "source")

# CRAN version
install.packages("xgboost")
# Load the package; library() errors on failure, unlike require()
library(xgboost)
载入xgboost包自带的蘑菇(agaricus)数据集:
# Load the mushroom (agaricus) demo datasets bundled with xgboost
data(agaricus.train, agaricus.test, package = "xgboost")
train <- agaricus.train
test  <- agaricus.test
# train is a list with two elements: data and label
str(train)
List of 2
$ data :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
.. ..@ i : int [1:143286] 2 6 8 11 18 20 21 24 28 32 ...
.. ..@ p : int [1:127] 0 369 372 3306 5845 6489 6513 8380 8384 10991 ...
.. ..@ Dim : int [1:2] 6513 126
.. ..@ Dimnames:List of 2
.. .. ..$ : NULL
.. .. ..$ : chr [1:126] "cap-shape=bell" "cap-shape=conical" "cap-shape=convex" "cap-shape=flat" ...
.. ..@ x : num [1:143286] 1 1 1 1 1 1 1 1 1 1 ...
.. ..@ factors : list()
$ label: num [1:6513] 1 0 0 1 0 0 0 1 0 0 ...
dim(train$data)
[1] 6513 126
dim(test$data)
[1] 1611 126
# data is stored as a sparse dgCMatrix; label is a numeric {0,1} vector
class(train$data)[1]
[1] "dgCMatrix"
class(train$label)
[1] "numeric"
# objective = "binary:logistic": train a binary classification model
# max_depth = 2: the dataset is small, so cap tree depth at 2
# nthread = 2: use 2 CPU threads
# nrounds = 2: boost for 2 rounds
bstSparse <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
[1] train-error:0.046522
[2] train-error:0.022263
# Second input form: the same model trained from a dense matrix
bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max_depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
[1] train-error:0.046522
[2] train-error:0.022263
# Third input form: an xgb.DMatrix object
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
bstDMatrix <- xgboost(data = dtrain, max_depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
[1] train-error:0.046522
[2] train-error:0.022263
# verbose = 0, no message
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
# verbose = 1, print evaluation metric each round
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 1)
[1] train-error:0.046522
[2] train-error:0.022263
# verbose = 2, also print internal information about tree construction
bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 2)
[18:21:12] amalgamation/../src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2
[1] train-error:0.046522
[18:21:12] amalgamation/../src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2
[2] train-error:0.022263
# Predict class probabilities on the test set
pred <- predict(bst, test$data)
# size of the prediction vector: one probability per test row
print(length(pred))
[1] 1611
# display the first few predictions (head() shows 6 by default)
print(head(pred))
[1] 0.28583017 0.92392391 0.28583017 0.28583017 0.05169873 0.92392391
# predict() returns probabilities; threshold at 0.5 to map to {0,1} classes
prediction <- as.numeric(pred > 0.5)
print(head(prediction))
[1] 0 1 0 0 0 1
# Misclassification rate: share of thresholded predictions disagreeing with labels
predicted_class <- as.numeric(pred > 0.5)
err <- mean(predicted_class != test$label)
print(paste("test-error=", err))
[1] "test-error= 0.0217256362507759"
# Rebuild DMatrix objects so both train and test can be monitored
dtrain <- xgb.DMatrix(data = train$data, label = train$label)
dtest <- xgb.DMatrix(data = test$data, label = test$label)
# A watchlist makes xgb.train report each metric on every dataset per round,
# which helps detect overfitting from too many boosting rounds
watchlist <- list(train = dtrain, test = dtest)
bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nthread = 2, nrounds = 2, watchlist = watchlist, objective = "binary:logistic")
[1] train-error:0.046522 test-error:0.042831
[2] train-error:0.022263 test-error:0.021726
# Use xgb.cv (cross-validation) with early_stopping_rounds to guard against
# overfitting; training stops once test AUC fails to improve for 5 rounds
params <- list(
  objective = "binary:logistic",
  eta = 1,
  max_depth = 2,
  nthread = 2
)
bst <- xgb.cv(params=params, data=dtrain, nrounds=200, nfold=5, early_stopping_rounds=5, metrics = list("auc"))
[1] train-auc:0.958221+0.001361 test-auc:0.958115+0.005430
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 5 rounds.
[2] train-auc:0.981411+0.000870 test-auc:0.981378+0.003471
[3] train-auc:0.997073+0.000346 test-auc:0.997112+0.001378
[4] train-auc:0.998757+0.000109 test-auc:0.998757+0.000454
[5] train-auc:0.999299+0.000104 test-auc:0.999310+0.000414
[6] train-auc:0.999504+0.000070 test-auc:0.999466+0.000421
[7] train-auc:0.999673+0.000067 test-auc:0.999631+0.000295
[8] train-auc:0.999757+0.000183 test-auc:0.999621+0.000380
[9] train-auc:0.999912+0.000076 test-auc:0.999902+0.000079
[10] train-auc:0.999979+0.000023 test-auc:0.999965+0.000036
[11] train-auc:1.000000+0.000000 test-auc:1.000000+0.000000
[12] train-auc:1.000000+0.000000 test-auc:1.000000+0.000000
[13] train-auc:1.000000+0.000000 test-auc:1.000000+0.000000
[14] train-auc:1.000000+0.000000 test-auc:1.000000+0.000000
[15] train-auc:1.000000+0.000000 test-auc:1.000000+0.000000
[16] train-auc:1.000000+0.000000 test-auc:1.000000+0.000000
Stopping. Best iteration:
[11] train-auc:1.000000+0.000000 test-auc:1.000000+0.000000
# eval_metric can be supplied multiple times to track several metrics at once
bst <- xgb.train(data=dtrain, max_depth=2, eta=1, nthread = 2, nrounds=2, watchlist=watchlist, eval_metric = "error", eval_metric = "logloss", objective = "binary:logistic")
[1] train-error:0.046522 train-logloss:0.233376 test-error:0.042831 test-logloss:0.226686
[2] train-error:0.022263 train-logloss:0.136658 test-error:0.021726 test-logloss:0.137874
之前的XGBoost都是基于提升树,除此之外,还可以建立基于线性提升器的模型。设置参数booster = "gblinear",同时移除eta参数:
# Linear booster: booster = "gblinear" fits a linear model instead of trees;
# eta is dropped here. NOTE(review): max_depth is a tree parameter and is
# presumably ignored by gblinear — confirm against xgboost docs.
bst <- xgb.train(data=dtrain, booster = "gblinear", max_depth=2, nthread = 2, nrounds=2, watchlist=watchlist, eval_metric = "error", eval_metric = "logloss", objective = "binary:logistic")
[1] train-error:0.028405 train-logloss:0.189791 test-error:0.023588 test-logloss:0.186855
[2] train-error:0.004453 train-logloss:0.072454 test-error:0.004345 test-logloss:0.071301
对于小的数据集,由于真实情况可能是线性可分的,线性提升器的效果可能更好。所以在建模时,建议尝试两种提升器并比较效果。
# Save the xgb.DMatrix object to a binary buffer file with xgb.DMatrix.save
xgb.DMatrix.save(dtrain, "dtrain.buffer")
[1] TRUE
# Reload the saved xgb.DMatrix: simply call xgb.DMatrix on the buffer file
dtrain2 <- xgb.DMatrix("dtrain.buffer")
[18:21:13] 6513x126 matrix with 143286 entries loaded from dtrain.buffer
# Training from the reloaded DMatrix reproduces the earlier results
bst <- xgb.train(data=dtrain2, max_depth=2, eta=1, nthread = 2, nrounds=2, watchlist=watchlist, objective = "binary:logistic")
[1] train-error:0.046522 test-error:0.042831
[2] train-error:0.022263 test-error:0.021726
# Clean up: delete the saved xgb.DMatrix buffer file
file.remove("dtrain.buffer")
[1] TRUE
# getinfo extracts fields from an xgb.DMatrix: 'label', 'weight',
# 'base_margin', 'nrow'
label <- getinfo(dtest, "label")
pred <- predict(bst, dtest)
# Misclassification rate: mean of logicals equals sum(...)/length(...)
err <- mean(as.integer(pred > 0.5) != label)
print(paste("test-error=", err))
[1] "test-error= 0.0217256362507759"
# Per-feature importance of the trained model, printed and plotted
imp_mat <- xgb.importance(model = bst)
print(imp_mat)
xgb.plot.importance(importance_matrix = imp_mat)
# Dump the learned model as text; with_stats adds gain/cover statistics
# (use TRUE rather than T, which is a reassignable alias)
xgb.dump(bst, with_stats = TRUE)
[1] "booster[0]"
[2] "0:[f28<-9.53674e-007] yes=1,no=2,missing=1,gain=4000.53,cover=1628.25"
[3] "1:[f55<-9.53674e-007] yes=3,no=4,missing=3,gain=1158.21,cover=924.5"
[4] "3:leaf=1.71218,cover=812"
[5] "4:leaf=-1.70044,cover=112.5"
[6] "2:[f108<-9.53674e-007] yes=5,no=6,missing=5,gain=198.174,cover=703.75"
[7] "5:leaf=-1.94071,cover=690.5"
[8] "6:leaf=1.85965,cover=13.25"
[9] "booster[1]"
[10] "0:[f59<-9.53674e-007] yes=1,no=2,missing=1,gain=832.545,cover=788.852"
[11] "1:[f28<-9.53674e-007] yes=3,no=4,missing=3,gain=569.725,cover=768.39"
[12] "3:leaf=0.784718,cover=458.937"
[13] "4:leaf=-0.96853,cover=309.453"
[14] "2:leaf=-6.23624,cover=20.4624"
# Plot the learned trees with xgb.plot.tree (requires the DiagrammeR package)
library(DiagrammeR)
xgb.plot.tree(model = bst)
# save model to binary local file
xgb.save(bst, "xgboost.model")
[1] TRUE
# load binary model to R
bst2 <- xgb.load("xgboost.model")
# Predict with the reloaded model on the raw sparse test matrix
pred2 <- predict(bst2, test$data)
比较原始模型和载入的模型是否完全一致
# If the reloaded model is identical, the sum of absolute differences is 0
print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred))))
[1] "sum(abs(pred2-pred))= 0"
# Clean up: delete the saved model file
file.remove("./xgboost.model")
[1] TRUE